
# Bibliothèques de base et calcul
import numpy as np
import pandas as pd
import scipy.stats as st
from scipy.stats import t, shapiro
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.stats.outliers_influence import variance_inflation_factor
import statsmodels
import math
# Bibliothèques de representation graphique
import matplotlib.pyplot as plt
import seaborn as sns
# Bibliothèques pour la regression lineaire
from statsmodels.formula.api import ols
from statsmodels.stats.diagnostic import het_white , normal_ad
from sklearn.linear_model import LinearRegression
# Options globales: arrondir float pour l'output display
pd.options.display.float_format = "{:.3f}".format #leigth est en .000
import warnings
warnings.filterwarnings('ignore')
# Load the data (training set): semicolon-separated CSV
billets_df = pd.read_csv("datas/billets.csv", sep=';')
# Split into two frames:
# - rows with no missing value -> used to fit the regression (find the equation)
# - rows with at least one NaN -> the fitted equation will impute them
df_valide = billets_df.dropna()
df_na = billets_df.loc[billets_df.isna().any(axis=1)]
billets_df.info()
billets_df.describe(include="all")
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1500 entries, 0 to 1499 Data columns (total 7 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 is_genuine 1500 non-null bool 1 diagonal 1500 non-null float64 2 height_left 1500 non-null float64 3 height_right 1500 non-null float64 4 margin_low 1463 non-null float64 5 margin_up 1500 non-null float64 6 length 1500 non-null float64 dtypes: bool(1), float64(6) memory usage: 71.9 KB
| is_genuine | diagonal | height_left | height_right | margin_low | margin_up | length | |
|---|---|---|---|---|---|---|---|
| count | 1500 | 1500.000 | 1500.000 | 1500.000 | 1463.000 | 1500.000 | 1500.000 |
| unique | 2 | NaN | NaN | NaN | NaN | NaN | NaN |
| top | True | NaN | NaN | NaN | NaN | NaN | NaN |
| freq | 1000 | NaN | NaN | NaN | NaN | NaN | NaN |
| mean | NaN | 171.958 | 104.030 | 103.920 | 4.486 | 3.151 | 112.679 |
| std | NaN | 0.305 | 0.299 | 0.326 | 0.664 | 0.232 | 0.873 |
| min | NaN | 171.040 | 103.140 | 102.820 | 2.980 | 2.270 | 109.490 |
| 25% | NaN | 171.750 | 103.820 | 103.710 | 4.015 | 2.990 | 112.030 |
| 50% | NaN | 171.960 | 104.040 | 103.920 | 4.310 | 3.140 | 112.960 |
| 75% | NaN | 172.170 | 104.230 | 104.150 | 4.870 | 3.310 | 113.340 |
| max | NaN | 173.010 | 104.880 | 104.950 | 6.900 | 3.910 | 114.440 |
# Count the missing values per column
billets_df.isna().sum()
is_genuine 0 diagonal 0 height_left 0 height_right 0 margin_low 37 margin_up 0 length 0 dtype: int64
Il y a 37 valeurs manquantes pour la variable margin_low
# Fit the full multiple linear regression: margin_low explained by every
# other variable (OLS on the rows without missing values)
formula_full = 'margin_low~is_genuine+diagonal+height_left+height_right+margin_up+length'
reg_multi = smf.ols(formula_full, data=df_valide).fit()
print(reg_multi.summary())
OLS Regression Results
==============================================================================
Dep. Variable: margin_low R-squared: 0.617
Model: OLS Adj. R-squared: 0.615
Method: Least Squares F-statistic: 390.7
Date: Tue, 06 Sep 2022 Prob (F-statistic): 4.75e-299
Time: 13:35:46 Log-Likelihood: -774.14
No. Observations: 1463 AIC: 1562.
Df Residuals: 1456 BIC: 1599.
Df Model: 6
Covariance Type: nonrobust
======================================================================================
coef std err t P>|t| [0.025 0.975]
--------------------------------------------------------------------------------------
Intercept 2.8668 8.316 0.345 0.730 -13.445 19.179
is_genuine[T.True] -1.1406 0.050 -23.028 0.000 -1.238 -1.043
diagonal -0.0130 0.036 -0.364 0.716 -0.083 0.057
height_left 0.0283 0.039 0.727 0.468 -0.048 0.105
height_right 0.0267 0.038 0.701 0.484 -0.048 0.102
margin_up -0.2128 0.059 -3.621 0.000 -0.328 -0.098
length -0.0039 0.023 -0.166 0.868 -0.050 0.042
==============================================================================
Omnibus: 21.975 Durbin-Watson: 2.038
Prob(Omnibus): 0.000 Jarque-Bera (JB): 37.993
Skew: 0.061 Prob(JB): 5.62e-09
Kurtosis: 3.780 Cond. No. 1.95e+05
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.95e+05. This might indicate that there are
strong multicollinearity or other numerical problems.
On applique la fonction pour trouver le modèle optimal par l'algorithme « backward »
# Backward stepwise selection via the project helper module (functions_RL):
# repeatedly drops the predictor with the highest p-value until the model stabilizes.
from functions_RL import *  # NOTE(review): star import of a project-local module; its contents are not visible here
columns = ['is_genuine','margin_low','diagonal','height_left','height_right','margin_up','length']
# Target is margin_low; the remaining columns are the candidate predictors.
reg_backward = backward_selected(df_valide[columns], 'margin_low')
_______________________________
margin_low ~ length + margin_up + height_right + height_left + is_genuine + diagonal + 1
remove length (p-value : 0.868 )
_______________________________
margin_low ~ margin_up + height_right + height_left + is_genuine + diagonal + 1
remove diagonal (p-value : 0.719 )
_______________________________
margin_low ~ margin_up + height_right + height_left + is_genuine + 1
remove height_right (p-value : 0.496 )
_______________________________
margin_low ~ margin_up + height_left + is_genuine + 1
remove height_left (p-value : 0.454 )
_______________________________
margin_low ~ margin_up + is_genuine + 1
is the final model !
OLS Regression Results
==============================================================================
Dep. Variable: margin_low R-squared: 0.617
Model: OLS Adj. R-squared: 0.616
Method: Least Squares F-statistic: 1174.
Date: Tue, 06 Sep 2022 Prob (F-statistic): 1.24e-304
Time: 13:35:47 Log-Likelihood: -774.73
No. Observations: 1463 AIC: 1555.
Df Residuals: 1460 BIC: 1571.
Df Model: 2
Covariance Type: nonrobust
======================================================================================
coef std err t P>|t| [0.025 0.975]
--------------------------------------------------------------------------------------
Intercept 5.9263 0.198 30.003 0.000 5.539 6.314
is_genuine[T.True] -1.1632 0.029 -40.477 0.000 -1.220 -1.107
margin_up -0.2119 0.059 -3.612 0.000 -0.327 -0.097
==============================================================================
Omnibus: 22.365 Durbin-Watson: 2.041
Prob(Omnibus): 0.000 Jarque-Bera (JB): 39.106
Skew: 0.057 Prob(JB): 3.22e-09
Kurtosis: 3.793 Cond. No. 65.0
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
Le meilleur modele pour la régression Linéaire est : 'margin_low ~ is_genuine + margin_up + 1'
# Ordinary-least-squares fit of the model retained by the backward selection
best_formula = 'margin_low ~ is_genuine + margin_up + 1'
reg_multi = smf.ols(best_formula, data=df_valide).fit()
print(reg_multi.summary())
OLS Regression Results
==============================================================================
Dep. Variable: margin_low R-squared: 0.617
Model: OLS Adj. R-squared: 0.616
Method: Least Squares F-statistic: 1174.
Date: Tue, 06 Sep 2022 Prob (F-statistic): 1.24e-304
Time: 13:35:47 Log-Likelihood: -774.73
No. Observations: 1463 AIC: 1555.
Df Residuals: 1460 BIC: 1571.
Df Model: 2
Covariance Type: nonrobust
======================================================================================
coef std err t P>|t| [0.025 0.975]
--------------------------------------------------------------------------------------
Intercept 5.9263 0.198 30.003 0.000 5.539 6.314
is_genuine[T.True] -1.1632 0.029 -40.477 0.000 -1.220 -1.107
margin_up -0.2119 0.059 -3.612 0.000 -0.327 -0.097
==============================================================================
Omnibus: 22.365 Durbin-Watson: 2.041
Prob(Omnibus): 0.000 Jarque-Bera (JB): 39.106
Skew: 0.057 Prob(JB): 3.22e-09
Kurtosis: 3.793 Cond. No. 65.0
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
# All diagnostic tests are run at significance level alpha = 5%:
alpha = 0.05
# n = sample size; p = number of variables used in the diagnostic thresholds below.
n = df_valide.shape[0]
# NOTE(review): the final model has 2 predictors (+ intercept = 3 parameters);
# p = 4 does not obviously match it — confirm the intended value.
p = 4
# X keeps all candidate columns; predict() on a formula-fitted model selects the
# columns it needs (is_genuine, margin_up) by name, so the extras are ignored.
X = df_valide[['is_genuine','diagonal','height_left','height_right','margin_up','length']]
# Column of fitted values for margin_low
df_valide["margin_low_pred"] = reg_multi.predict(X)
# Column of residuals (observed - fitted)
df_valide["residual"] = df_valide["margin_low"] - df_valide["margin_low_pred"]
df_valide
| is_genuine | diagonal | height_left | height_right | margin_low | margin_up | length | margin_low_pred | residual | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | True | 171.810 | 104.860 | 104.950 | 4.520 | 2.890 | 112.830 | 4.151 | 0.369 |
| 1 | True | 171.460 | 103.360 | 103.660 | 3.770 | 2.990 | 113.090 | 4.129 | -0.359 |
| 2 | True | 172.690 | 104.480 | 103.500 | 4.400 | 2.940 | 113.160 | 4.140 | 0.260 |
| 3 | True | 171.360 | 103.910 | 103.940 | 3.620 | 3.010 | 113.510 | 4.125 | -0.505 |
| 4 | True | 171.730 | 104.280 | 103.460 | 4.040 | 3.480 | 112.540 | 4.026 | 0.014 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1495 | False | 171.750 | 104.380 | 104.170 | 4.420 | 3.090 | 111.280 | 5.271 | -0.851 |
| 1496 | False | 172.190 | 104.630 | 104.440 | 5.270 | 3.370 | 110.970 | 5.212 | 0.058 |
| 1497 | False | 171.800 | 104.010 | 104.120 | 5.510 | 3.360 | 111.950 | 5.214 | 0.296 |
| 1498 | False | 172.060 | 104.280 | 104.060 | 5.170 | 3.460 | 112.250 | 5.193 | -0.023 |
| 1499 | False | 171.470 | 104.150 | 103.820 | 4.630 | 3.370 | 112.070 | 5.212 | -0.582 |
1463 rows × 9 columns
Nous allons mener des analyses sur les valeurs atypiques et/ou influentes en travaillant sur un dataframe appelé analyses.
# One row per observation, numbered 1..n, to collect the influence diagnostics
analyses = pd.DataFrame({'obs': np.arange(n) + 1})
On peut calculer les leviers comme ceci, en sachant que le seuil des leviers est de $2∗\frac{p}{n}$.
# Leverage = diagonal of the hat matrix; the usual flag threshold is 2*p/n
influence_obj = reg_multi.get_influence()
analyses['levier'] = influence_obj.hat_matrix_diag
seuil_levier = 2 * p / n
# Bar plot of the leverages with the threshold as a red horizontal line
plt.figure(figsize=(10, 6))
plt.bar(analyses['obs'], analyses['levier'])
plt.xticks(np.arange(0, 1500, step=5))
plt.xlabel('Observation')
plt.ylabel('Leviers')
plt.plot([0, 1500], [seuil_levier, seuil_levier], color='r')
plt.show()
# IAI = atypical and/or influential individuals: leverage above the threshold
mask_levier = analyses['levier'] > seuil_levier
iai = analyses.loc[mask_levier, :]
print(iai)
len(iai)
obs levier 48 49 0.007 52 53 0.011 123 124 0.006 376 377 0.006 649 650 0.013 655 656 0.006 774 775 0.006 779 780 0.006 952 953 0.006 1000 1001 0.008 1167 1168 0.006 1235 1236 0.005 1238 1239 0.006 1248 1249 0.005 1277 1278 0.006
15
# Sanity check: OLS residuals should average to (numerically) zero
np.mean(reg_multi.resid)
4.503422773298379e-15
Si l'on souhaite maintenant calculer les résidus studentisés, nous écrivons ceci, sachant que le seuil pour les résidus studentisés est une loi de Student à n-p-1 degrés de liberté :
# Externally studentized residuals: each residual is scaled by an error-variance
# estimate computed *without* that observation, so under the model they follow a
# Student t distribution with n-p-1 degrees of freedom — exactly the reference
# distribution used for the threshold below.
# Fix: the original used resid_studentized_internal (standardized residuals),
# which do not follow the t(n-p-1) law announced in the accompanying text.
analyses['rstudent'] = reg_multi.get_influence().resid_studentized_external
seuil_rstudent = t.ppf(1-alpha/2, n-p-1)
# Bar plot with the symmetric +/- threshold lines in red
plt.figure(figsize=(10,6))
plt.bar(analyses['obs'], analyses['rstudent'])
plt.xticks(np.arange(0, 1500, step=5))
plt.xlabel('Observation')
plt.ylabel('Résidus studentisés')
plt.plot([0, 1500], [seuil_rstudent, seuil_rstudent], color='r')
plt.plot([0, 1500], [-seuil_rstudent, -seuil_rstudent], color='r')
plt.show()
# Full influence table (leverage, Cook's distance, dffits, ...)
influence = reg_multi.get_influence().summary_frame()
Le seuil usuel de détection pour la distance de Cook est de 4/(n−p).
# Cook's distance measures how much the whole fit moves when one observation is
# removed; the conventional flag threshold is 4/(n-p).
analyses['dcooks'] = influence['cooks_d']
seuil_dcook = 4/(n-p)
# Influential observations are the bars rising above the red threshold line:
plt.figure(figsize=(10,6))
plt.bar(analyses['obs'], analyses['dcooks'])
plt.xticks(np.arange(0, 1500, step=5))
plt.xlabel('Observation')
plt.ylabel('Distance de Cook')  # fix: was mislabelled 'Leviers' (copy-paste from the leverage plot)
plt.plot([0, 1500], [seuil_dcook, seuil_dcook], color='r')
plt.show()
On ne retire des points qu'après avoir vérifié qu'ils sont effectivement atypiques, voire aberrants, au vu du modèle estimé.
Une autre chose à vérifier est l'éventuelle colinéarité approchée des variables :
Remarque : si l'on écartait is_genuine (booléen) pour ne garder qu'une matrice purement numérique, il ne resterait qu'une seule variable explicative (margin_up) et le VIF n'aurait plus de sens ; on calcule donc le VIF directement sur la matrice de design du modèle (exog) ci-dessous.
# Variance inflation factors computed on the fitted design matrix.
# Column 0 of exog is the intercept, so the loop starts at 1 to skip it;
# a VIF above ~10 would signal problematic collinearity.
variables = reg_multi.model.exog
[variance_inflation_factor(variables, i) for i in np.arange(1,variables.shape[1])]
[1.5938854494007753, 1.5938854494007746]
Ici, tous les coefficients sont inférieurs à 10, il n'y a donc pas de problème de colinéarité.
On peut également tester l’homoscédasticité (c'est-à-dire la constance de la variance) des résidus :
# Breusch-Pagan test of homoscedasticity of the residuals
bp_result = statsmodels.stats.diagnostic.het_breuschpagan(reg_multi.resid, variables)
pval, f_pval = bp_result[1], bp_result[3]
print('p value test Breusch Pagan:', pval)
p value test Breusch Pagan: 3.2033559115856614e-36
# White test of homoscedasticity: statistic, p-value, F statistic, F p-value
white_test = het_white(reg_multi.resid, reg_multi.model.exog)
labels_white_test = ["Test Statistic", "p-value", "F-Statistic", "F-Test p-value"]
print(dict(zip(labels_white_test, white_test)))
# Residuals vs observation index, with a dashed zero line for reference
plt.subplots(figsize=(8, 4))
plt.scatter(x=df_valide.index, y=df_valide["residual"], alpha=0.8)
zero_line = np.repeat(0, len(df_valide.index) + 2)
plt.plot(zero_line, color="black", linestyle='--')
plt.title("Homoscédasticité")
{'Test Statistic': 166.92896597830813, 'p-value': 4.769905016347682e-35, 'F-Statistic': 46.94619855077706, 'F-Test p-value': 3.7024551237815678e-37}
Text(0.5, 1.0, 'Homoscédasticité')
https://fr.wikipedia.org/wiki/Test_de_White table khi2 : https://archimede.mat.ulaval.ca/stt1920/STT-1920-Loi-du-khi-deux.pdf
Si l'on veut tester la normalité des résidus, on peut faire un test de Shapiro-Wilk.
# Shapiro-Wilk normality test of the residuals (H0: residuals are Gaussian)
shapiro(reg_multi.resid)
ShapiroResult(statistic=0.9936248064041138, pvalue=6.20942773821298e-06)
Ici, l'hypothèse de normalité est remise en cause (p-value = 6.20942773821298e-06 < 0.05).
Néanmoins, l'observation des résidus, le fait qu'ils ne soient pas très différents d'une distribution symétrique, et le fait que l'échantillon soit de taille suffisante (supérieure à 30) permettent de dire que les résultats obtenus par le modèle linéaire gaussien ne sont pas absurdes, même si le résidu n'est pas considéré comme étant gaussien.
# Residuals (observed - fitted); recomputed here, identical to the earlier column.
df_valide["residual"] = df_valide["margin_low"] - df_valide["margin_low_pred"]
# Anderson-Darling normality test (H0: residuals are Gaussian).
# Fix: message typos "Aderson-Darling vaux" -> "Anderson-Darling vaut".
print("La p-value du test d'Anderson-Darling vaut", normal_ad(df_valide["residual"])[1])
# Histogram of the residuals with a kernel-density estimate overlaid
sns.histplot(df_valide["residual"], kde=True)
plt.xlabel("Résidus")
plt.ylabel("Compte")
plt.title("Distribution des résidus")
plt.show()
La p-value du test d'Anderson-Darling vaut 4.8613414248955766e-05
# First five rows of the (now imputed) dataset
billets_df.head()
| is_genuine | diagonal | height_left | height_right | margin_low | margin_up | length | |
|---|---|---|---|---|---|---|---|
| 0 | True | 171.810 | 104.860 | 104.950 | 4.520 | 2.890 | 112.830 |
| 1 | True | 171.460 | 103.360 | 103.660 | 3.770 | 2.990 | 113.090 |
| 2 | True | 172.690 | 104.480 | 103.500 | 4.400 | 2.940 | 113.160 |
| 3 | True | 171.360 | 103.910 | 103.940 | 3.620 | 3.010 | 113.510 |
| 4 | True | 171.730 | 104.280 | 103.460 | 4.040 | 3.480 | 112.540 |
# QQ-plot: residual quantiles against theoretical normal quantiles
sm.qqplot(reg_multi.resid)
Erreur de prédiction = résidus = ŷ − y. Après avoir estimé un modèle de régression linéaire, il faut ensuite analyser :
Normalité de la distribution des résidus :
Test de normalité des résidus :
Test de Durbin-Watson : test statistique destiné à détecter l'autocorrélation des résidus dans un modèle de régression linéaire. Une statistique proche de 2 indique une absence d'autocorrélation ; ici DW = 2,041, donc pas d'autocorrélation des résidus.
# Predict margin_low for the rows where it is missing, using the fitted model
features = ['is_genuine', 'diagonal', 'height_left', 'height_right', 'margin_up', 'length']
X = df_na[features]
pred = reg_multi.predict(X)
print(pred)
72 4.074 99 4.098 151 4.123 197 3.996 241 4.123 251 4.138 284 4.078 334 4.127 410 4.102 413 4.093 445 4.125 481 4.189 505 4.131 611 4.064 654 4.191 675 4.220 710 4.112 739 4.108 742 4.076 780 4.100 798 4.125 844 4.129 845 4.148 871 4.112 895 4.131 919 4.216 945 4.123 946 4.070 981 4.129 1076 5.246 1121 5.254 1176 5.286 1303 5.305 1315 5.206 1347 5.161 1435 5.172 1438 5.240 dtype: float64
# Impute: assign the predictions to the NaN margin_low cells of billets_df.
# pandas aligns `pred` on its index (the df_na row labels), so each prediction
# lands on the exact row it was computed for.
billets_df.loc[billets_df.margin_low.isna(),'margin_low']=pred
# Check that the linear-regression imputation left no missing value
billets_df.isna().sum()
is_genuine 0 diagonal 0 height_left 0 height_right 0 margin_low 0 margin_up 0 length 0 dtype: int64
# Numeric summary after imputation (margin_low mean/std shift only slightly)
billets_df.describe()
| diagonal | height_left | height_right | margin_low | margin_up | length | |
|---|---|---|---|---|---|---|
| count | 1500.000 | 1500.000 | 1500.000 | 1500.000 | 1500.000 | 1500.000 |
| mean | 171.958 | 104.030 | 103.920 | 4.483 | 3.151 | 112.679 |
| std | 0.305 | 0.299 | 0.326 | 0.660 | 0.232 | 0.873 |
| min | 171.040 | 103.140 | 102.820 | 2.980 | 2.270 | 109.490 |
| 25% | 171.750 | 103.820 | 103.710 | 4.027 | 2.990 | 112.030 |
| 50% | 171.960 | 104.040 | 103.920 | 4.310 | 3.140 | 112.960 |
| 75% | 172.170 | 104.230 | 104.150 | 4.870 | 3.310 | 113.340 |
| max | 173.010 | 104.880 | 104.950 | 6.900 | 3.910 | 114.440 |
# Display the valid-rows subset with its prediction and residual columns
df_valide
| is_genuine | diagonal | height_left | height_right | margin_low | margin_up | length | margin_low_pred | residual | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | True | 171.810 | 104.860 | 104.950 | 4.520 | 2.890 | 112.830 | 4.151 | 0.369 |
| 1 | True | 171.460 | 103.360 | 103.660 | 3.770 | 2.990 | 113.090 | 4.129 | -0.359 |
| 2 | True | 172.690 | 104.480 | 103.500 | 4.400 | 2.940 | 113.160 | 4.140 | 0.260 |
| 3 | True | 171.360 | 103.910 | 103.940 | 3.620 | 3.010 | 113.510 | 4.125 | -0.505 |
| 4 | True | 171.730 | 104.280 | 103.460 | 4.040 | 3.480 | 112.540 | 4.026 | 0.014 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1495 | False | 171.750 | 104.380 | 104.170 | 4.420 | 3.090 | 111.280 | 5.271 | -0.851 |
| 1496 | False | 172.190 | 104.630 | 104.440 | 5.270 | 3.370 | 110.970 | 5.212 | 0.058 |
| 1497 | False | 171.800 | 104.010 | 104.120 | 5.510 | 3.360 | 111.950 | 5.214 | 0.296 |
| 1498 | False | 172.060 | 104.280 | 104.060 | 5.170 | 3.460 | 112.250 | 5.193 | -0.023 |
| 1499 | False | 171.470 | 104.150 | 103.820 | 4.630 | 3.370 | 112.070 | 5.212 | -0.582 |
1463 rows × 9 columns
# Observed (dark blue) vs fitted (red) margin_low over the training rows
plt.figure(figsize=(8,4), dpi=100)
plt.plot(df_valide["margin_low"], c='darkblue')
plt.plot(df_valide["margin_low_pred"], c='red')
plt.xlabel("Billets")
plt.ylabel("Margin_low")
plt.title("Visualisation de la prediction de margin_Low par régression linéaire",fontsize=13,loc='center')
plt.savefig('img/Visualisation de régression lineaire.png')
plt.show()
# Structure and full summary of the completed dataset (no NaN left)
billets_df.info()
billets_df.describe(include="all")
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1500 entries, 0 to 1499 Data columns (total 7 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 is_genuine 1500 non-null bool 1 diagonal 1500 non-null float64 2 height_left 1500 non-null float64 3 height_right 1500 non-null float64 4 margin_low 1500 non-null float64 5 margin_up 1500 non-null float64 6 length 1500 non-null float64 dtypes: bool(1), float64(6) memory usage: 71.9 KB
| is_genuine | diagonal | height_left | height_right | margin_low | margin_up | length | |
|---|---|---|---|---|---|---|---|
| count | 1500 | 1500.000 | 1500.000 | 1500.000 | 1500.000 | 1500.000 | 1500.000 |
| unique | 2 | NaN | NaN | NaN | NaN | NaN | NaN |
| top | True | NaN | NaN | NaN | NaN | NaN | NaN |
| freq | 1000 | NaN | NaN | NaN | NaN | NaN | NaN |
| mean | NaN | 171.958 | 104.030 | 103.920 | 4.483 | 3.151 | 112.679 |
| std | NaN | 0.305 | 0.299 | 0.326 | 0.660 | 0.232 | 0.873 |
| min | NaN | 171.040 | 103.140 | 102.820 | 2.980 | 2.270 | 109.490 |
| 25% | NaN | 171.750 | 103.820 | 103.710 | 4.027 | 2.990 | 112.030 |
| 50% | NaN | 171.960 | 104.040 | 103.920 | 4.310 | 3.140 | 112.960 |
| 75% | NaN | 172.170 | 104.230 | 104.150 | 4.870 | 3.310 | 113.340 |
| max | NaN | 173.010 | 104.880 | 104.950 | 6.900 | 3.910 | 114.440 |
# Re-check: every column should now report zero missing values
billets_df.isna().sum()
is_genuine 0 diagonal 0 height_left 0 height_right 0 margin_low 0 margin_up 0 length 0 dtype: int64
print("Nb billet true\n" ,billets_df.loc[billets_df["is_genuine"]==True].count(),"\n")
print("Nb billet false\n" ,billets_df.loc[billets_df["is_genuine"]==False].count())
Nb billet true is_genuine 1000 diagonal 1000 height_left 1000 height_right 1000 margin_low 1000 margin_up 1000 length 1000 dtype: int64 Nb billet false is_genuine 500 diagonal 500 height_left 500 height_right 500 margin_low 500 margin_up 500 length 500 dtype: int64
# Same counts as a compact table (one column per class: False / True)
billets_df.groupby("is_genuine").count().T
| is_genuine | False | True |
|---|---|---|
| diagonal | 500 | 1000 |
| height_left | 500 | 1000 |
| height_right | 500 | 1000 |
| margin_low | 500 | 1000 |
| margin_up | 500 | 1000 |
| length | 500 | 1000 |
# NOTE(review): the 'seaborn-whitegrid' style name was removed in matplotlib 3.8
# (renamed 'seaborn-v0_8-whitegrid') — confirm the pinned matplotlib version.
plt.style.use('seaborn-whitegrid')
# Pie chart of the genuine/counterfeit split; counts are taken from the
# 'diagonal' column (any fully-populated column would give the same counts).
# groupby sorts keys, so False (counterfeit) comes first — matching the labels.
data_genuine = billets_df.groupby("is_genuine").count()
data_genuine.plot.pie(y="diagonal",
figsize=(5, 5),explode = [0, 0.1],
labels = ["Faux billets", "Vrais billets"],
textprops={'fontsize':14},
autopct = '%1.2f%%',
pctdistance = 0.4, labeldistance = 0.7)
plt.title('Répartition des vrais et faux billets du Dataset', loc='center', fontsize=14)
plt.savefig('img/billets_repartition_vrai_faux_dfComplet.png')
plt.show()
# Central tendency: per-class mean of each measurement
billets_df.groupby(by="is_genuine").mean().T
| is_genuine | False | True |
|---|---|---|
| diagonal | 171.901 | 171.987 |
| height_left | 104.190 | 103.949 |
| height_right | 104.144 | 103.809 |
| margin_low | 5.216 | 4.116 |
| margin_up | 3.350 | 3.052 |
| length | 111.631 | 113.202 |
# Central tendency: per-class median (robust to outliers)
billets_df.groupby(by="is_genuine").median().T
| is_genuine | False | True |
|---|---|---|
| diagonal | 171.910 | 171.990 |
| height_left | 104.180 | 103.950 |
| height_right | 104.160 | 103.810 |
| margin_low | 5.195 | 4.112 |
| margin_up | 3.350 | 3.050 |
| length | 111.630 | 113.205 |
# Per-class empirical variance of each measurement, to gauge dispersion
billets_df.groupby(by='is_genuine').var().T
| is_genuine | False | True |
|---|---|---|
| diagonal | 0.094 | 0.090 |
| height_left | 0.050 | 0.090 |
| height_right | 0.073 | 0.085 |
| margin_low | 0.302 | 0.099 |
| margin_up | 0.033 | 0.035 |
| length | 0.379 | 0.129 |
Les variables concernant margin_low sont faiblement impactées
# Sum of the two per-class variances for each variable
billets_df.groupby(by='is_genuine').var().sum()
diagonal 0.184 height_left 0.140 height_right 0.158 margin_low 0.400 margin_up 0.067 length 0.508 dtype: float64
Variance plus élevée pour les variables : margin_low (0.400) et length (0.508)
# Per-variable distributions split by genuine/counterfeit, on a 3x2 grid.
# Fix: the original repeated the subplot/title/histplot trio six times by hand
# (with duplicated plt.title calls on two panels); a loop produces the same
# figure without the copy-paste.
plt.figure(figsize=(15,15), dpi=100)  # figure size
sns.set(style="dark")
quantitative_vars = ["diagonal", "height_left", "height_right",
                     "margin_low", "margin_up", "length"]
for idx, var in enumerate(quantitative_vars, start=1):
    plt.subplot(3, 2, idx)
    plt.title('Distribution - analyse univariée')
    sns.histplot(data=billets_df, x=var, hue="is_genuine", bins=50, kde = True)
plt.subplots_adjust(left=0.125,
                    bottom=0.1,
                    right=0.9,
                    top=0.9,
                    wspace=0.2,
                    hspace=0.35)
plt.savefig('img/Analyse_univariee_distribution_dfComplet.jpg')
plt.show()
def var_hist(var, i):
    """Draw, on the global figure `fig`, a density histogram of `var` with a
    fitted normal density overlaid.

    var: column name present in both billets_df and df_valide.
    i:   3-digit subplot code (e.g. 321) passed to fig.add_subplot.

    NOTE(review): the histogram uses the full (imputed) billets_df while the
    mean/std come from df_valide (rows without NaN) — confirm this mix is
    intentional.
    """
    subset = billets_df[var]  # data actually plotted
    n_df_valide = len(df_valide)  # NOTE(review): computed but never used
    xbar = np.mean(df_valide[var])  # empirical mean
    sprime = np.std(df_valide[var], ddof=1)  # sample standard deviation
    sprime2 = np.var(df_valide[var], ddof=1)  # unbiased variance (unused below)
    ax = fig.add_subplot(i)
    ax.hist(subset, density=True)
    ax.axvline(xbar, color='r', linewidth=2, label="Moyenne empirique")
    # Grid over the observed range, step 0.05, to evaluate the normal density
    bins = np.arange(df_valide[var].min(),df_valide[var].max(),0.05)
    y = st.norm.pdf(bins, xbar, sprime)
    ax.plot(bins, y, '--', label="Densité normale")
    ax.legend()
    ax.set_xlabel(var, fontsize=12)
    ax.set_ylabel('Densité', fontsize=12)
    ax.set_title('Distribution de '+str(var), fontsize=18)
liste_var = ["diagonal","height_left","height_right","margin_low","margin_up","length"]
plt.style.use('seaborn-whitegrid')
fig = plt.figure(figsize=(20,30),constrained_layout=False)
i = 321
for var in liste_var :
var_hist(var, i)
i+=1
plt.savefig("img/analyse univariee histo_dfComplet.jpg")
# Boxplots of every quantitative column, to visualize outliers
a = 2  # grid rows
b = 3  # grid columns
c = 1  # current subplot slot
fig = plt.figure(figsize=(20,8))
quantitative_cols = billets_df.loc[:, billets_df.columns != "is_genuine"]
for i in quantitative_cols:  # iterating a DataFrame yields its column names
    plt.subplot(a, b, c)
    plt.title('{} (boxplot)'.format(i))
    plt.xlabel(i)  # x label = column name
    sns.boxplot(x = billets_df[i])
    c += 1  # move to the next subplot slot
plt.subplots_adjust(left=0.125,  # spacing between panels
                    bottom=0.1,
                    right=0.9,
                    top=0.9,
                    wspace=0.2,
                    hspace=0.35)
def var_boxplot(var, i):
    """Draw, on the global figure `fig`, a boxplot of `var` split by is_genuine.

    var: column name of billets_df.
    i:   3-digit subplot code passed to fig.add_subplot.
    """
    ax = fig.add_subplot(i)
    # seaborn draws on the current axes created just above
    ax = sns.boxplot(x="is_genuine", y=var, data=billets_df)
    plt.xlabel('Vrai / faux billets')
    plt.ylabel(var)
    plt.title('Boxplot de la variable '+str(var)+' en fonction du type de billet', color="blue", size=16)
liste_var = ["height_left","height_right","margin_low","margin_up","length","diagonal"]
fig = plt.figure(figsize=(20,30),constrained_layout=False)
i = 321
for var in liste_var :
var_boxplot(var, i)
i+=1
plt.savefig("img/analyse univariee boxplot_dfComplet.jpg")
# Pairwise scatter matrix of the quantitative variables
mycolumns = ['diagonal', 'height_left', 'height_right', 'margin_low', 'margin_up', 'length']
pd.plotting.scatter_matrix(billets_df[mycolumns], figsize=(12, 12))
plt.savefig("img/analyse bivariee_scatter_dfComplet.jpg")
# Bivariate views colored by class: scatter, 2D-histogram, and KDE pairplots.
# NOTE(review): 'paiplot' in the first filename looks like a typo for
# 'pairplot' — kept as-is since other artifacts may reference this path.
sns.pairplot(billets_df,hue='is_genuine',palette=None,height=2.5)
plt.savefig("img/analyse bivariee_paiplot_dfComplet.jpg")
sns.pairplot(billets_df,hue='is_genuine',palette=None,height=2.5, kind="hist")
plt.savefig("img/analyse bivariee_pairplotHist_dfComplet.jpg")
sns.pairplot(billets_df,hue='is_genuine',palette=None,height=2.5, kind="kde")
plt.savefig("img/analyse bivariee_pairplotKde_dfComplet.jpg")
# Correlation heatmap (coefficients rounded to 1 decimal — quite coarse).
# NOTE(review): corr() behavior with the bool is_genuine column depends on the
# pandas version (numeric_only default changed) — confirm against the pinned version.
matrice_corr = billets_df.corr().round(1)
sns.heatmap(data=matrice_corr, annot=True,cmap='coolwarm')
plt.savefig("img/analyse bivariee_heatmap_dfComplet.jpg")
# Export the completed (imputed) dataset; comma-separated, without the index
billets_df.to_csv('datas/billets_complet.csv',index=False)
# test = pd.read_csv("datas/billets_complet.csv", sep = ',')
# test